.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
library(sctransform, lib.loc = "/hpc/users/richaa21/.Rlib")
library(Seurat, lib.loc = "/hpc/users/richaa21/.Rlib")
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.2.0 but the current version is
## 4.3.0; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
## 'SeuratObject' was built with package 'Matrix' 1.6.4 but the current
## version is 1.6.5; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed
##
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
##
## intersect
library(patchwork)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
pbmc.data <- Read10X(data.dir = "/sc/arion/projects/ad-omics/ashley/data/Unstim/outs/per_sample_outs/Unstim/count/sample_filtered_feature_bc_matrix")
pbmc.unstim <- CreateSeuratObject(counts = pbmc.data, project = "pbmc_sc", min.cells = 3, min.features = 200)
pbmc.unstim
## An object of class Seurat
## 24998 features across 21114 samples within 1 assay
## Active assay: RNA (24998 features, 0 variable features)
## 1 layer present: counts
Lets check how many cells and features we are starting with.
length(colnames(pbmc.unstim)) #number of cells
## [1] 21114
length(rownames(pbmc.unstim)) #number of features
## [1] 24998
a. QC mitochondiral genes. The PercentFeatureSet() calculates the % of counts originating from a set of features - here we are first looking at mitochondiral features, which are genes starting with MT.
b. By using [[ ]], I am adding columns to the pbmc matrix to store this QC data.
pbmc.unstim[["percent.mt"]] <- PercentageFeatureSet(pbmc.unstim, pattern = "^MT-")
# Show QC metrics for the first 5 cells
head(pbmc.unstim@meta.data, 5)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1 pbmc_sc 16199 3729 1.524785
## AAACCTGAGAAGGGTA-1 pbmc_sc 12545 4071 4.161020
## AAACCTGAGACCTAGG-1 pbmc_sc 3125 1519 3.392000
## AAACCTGAGATAGTCA-1 pbmc_sc 10924 3523 4.137679
## AAACCTGAGCAAATCA-1 pbmc_sc 10657 3697 6.615370
library(farver, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library")
VlnPlot(pbmc.unstim, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(pbmc.unstim, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc.unstim, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1
plot2
# I will select for %mt under 10% and features greater than 200 to capture alive healthy cells.
pbmc.unstim <- subset(pbmc.unstim, subset = nFeature_RNA > 200 & percent.mt < 10)
head(pbmc.unstim@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1 pbmc_sc 16199 3729 1.524785
## AAACCTGAGAAGGGTA-1 pbmc_sc 12545 4071 4.161020
## AAACCTGAGACCTAGG-1 pbmc_sc 3125 1519 3.392000
## AAACCTGAGATAGTCA-1 pbmc_sc 10924 3523 4.137679
## AAACCTGAGCAAATCA-1 pbmc_sc 10657 3697 6.615370
## AAACCTGAGCAGGCTA-1 pbmc_sc 32387 6522 4.421527
The data is normalized based on the feature (number of genes in a cell) by the total expression. This number is multiplied by 10,000 and then log transformed. The function to do this is “NormalizeData.” The values specied below are the default values of this function.
pbmc.unstim <- NormalizeData(pbmc.unstim, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts
pbmc.unstim <- FindVariableFeatures(pbmc.unstim, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc.unstim), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc.unstim)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
plot1
plot2
## Scaling the Data The data is scaled by doing a linear transformation.
The ScaleData function does this by shifting the expression of genes so
that the mean expression becomes 0 and the variance is 1. By default,
only variable genes are scaled. This is changed by features =
all.genes
##Scaling RNA data, we only scale the variable features here for efficiency
all.genes <- rownames(pbmc.unstim)
pbmc.unstim <- ScaleData(pbmc.unstim, vars.to.regress = c("percent.mt"))
## Regressing out percent.mt
## Centering and scaling data matrix
For the first principal components, RunPCA shows the most positive (correlated) and most negative (anticorrelated) genes
pbmc.unstim <- RunPCA(pbmc.unstim, features = VariableFeatures(object = pbmc.unstim))
## PC_ 1
## Positive: IL7R, FP236383.3, SQSTM1, LTB, TCF7, AQP3, JAML, FP671120.4, HERPUD1, PIM2
## CD27, TIMP1, CCR7, TNFSF8, NCDN, CD40LG, BEX3, C1orf162, LINC00861, SNED1
## CCR4, IGFLR1, TRBV6-2, RCBTB2, MYC, TRAV8-3, FAAH2, WDR86-AS1, HSF4, LRRN3
## Negative: RRM2, MKI67, TYMS, TUBA1B, TOP2A, STMN1, UBE2C, PCLAF, CDK1, KIFC1
## ASF1B, NUSAP1, ZWINT, CCNA2, CENPF, ASPM, KNL1, HIST1H1B, GTSE1, TPX2
## SPC25, PKMYT1, TUBB, AURKB, CDCA8, CDCA2, BIRC5, CKAP2L, HJURP, GGH
## PC_ 2
## Positive: KLRD1, TYROBP, CTSW, FCER1G, PRF1, GZMB, NKG7, CCL3, CCL4, KLRC1
## IL2RB, SH2D1B, FCGR3A, PLEK, KLRC2, NCAM1, CST7, EOMES, KIR2DL4, AOAH
## CCL5, TRDC, GNLY, GZMA, NCR1, FASLG, TOX, SLAMF7, CD38, KLRK1
## Negative: IL7R, TIMP1, S100A6, AQP3, IL32, COTL1, S100A4, SNED1, WDR86-AS1, ANP32B
## STAT1, IL9R, TUBA4A, ITGB1, MYC, PLK1, CCNB2, LMNA, DLGAP5, CENPA
## SOS1, TUBA1C, HMMR, TCF7, PLP2, CRIP1, CEP55, CENPF, CDCA3, PRDX1
## PC_ 3
## Positive: GINS2, MCM2, CDC6, MCM6, MCM5, MCM7, MCM4, CDC45, E2F1, MCM3
## DTL, PAICS, UNG, FAM111B, FEN1, UHRF1, SLBP, HSP90AB1, CCNE2, PCNA
## HELLS, CHAF1B, CDCA7, SLC29A1, MCM10, CLSPN, NME1, MIF, DUT, CTNNAL1
## Negative: PLK1, CCNB1, KIF20A, CDC20, CENPA, NEK2, HMMR, CCNB2, DLGAP5, PSRC1
## ASPM, KIF14, TROAP, CENPF, CENPE, PIF1, AURKA, PIMREG, ARL6IP1, KIF2C
## UBALD2, PRR11, KNSTRN, GZMK, DEPDC1, CDCA8, UBE2C, KPNA2, TPX2, BIRC5
## PC_ 4
## Positive: CD79A, JCHAIN, POU2AF1, IGHM, BASP1, BLNK, MEF2C, SYK, TSPAN33, TCF4
## RALGPS2, IGLC2, SPI1, RHEX, CD40, IGHA1, BANK1, NCF1, CDK14, CD19
## LINC00926, FCRL5, LINC02362, BTK, TNFRSF17, AC012236.1, IGKC, SLC43A2, RASGRP3, IGLC3
## Negative: S100A4, CCL5, LTB, S100A6, IL32, GZMA, PFN1, CST7, HOPX, NKG7
## CD8A, CD8B, ALOX5AP, GZMM, FGFBP2, THEMIS, LYAR, GZMK, ACTB, PRF1
## FLNA, TMSB10, GZMB, LAG3, CFL1, SAMD3, AHNAK, CXCR3, ANXA2, NCR3
## PC_ 5
## Positive: LGALS1, S100A4, S100A6, IL32, ANXA2, TXN, HLA-DPA1, GAPDH, TMSB10, HLA-DRB1
## PFN1, CD74, ACTB, HLA-DQA1, HLA-DRA, CFL1, HOPX, PRDX1, HLA-DPB1, ACTG1
## ENO1, FTL, AHNAK, LINC02694, LGALS3, VIM, COTL1, JPT1, LAG3, FLNA
## Negative: TCF7, CCR7, TXK, PLAC8, FP236383.3, FCER1G, CXCR4, PTMA, ID3, NCAM1
## TYROBP, HIST1H1D, PTGDR, SELL, BACH2, C1orf162, SH2D1B, SESN3, KCNQ5, FHIT
## CTBP2, FAM111B, KLRF1, E2F7, RNF130, AC139720.1, EXO1, MYBL2, E2F8, ESCO2
print(pbmc.unstim[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: IL7R, FP236383.3, SQSTM1, LTB, TCF7
## Negative: RRM2, MKI67, TYMS, TUBA1B, TOP2A
## PC_ 2
## Positive: KLRD1, TYROBP, CTSW, FCER1G, PRF1
## Negative: IL7R, TIMP1, S100A6, AQP3, IL32
## PC_ 3
## Positive: GINS2, MCM2, CDC6, MCM6, MCM5
## Negative: PLK1, CCNB1, KIF20A, CDC20, CENPA
## PC_ 4
## Positive: CD79A, JCHAIN, POU2AF1, IGHM, BASP1
## Negative: S100A4, CCL5, LTB, S100A6, IL32
## PC_ 5
## Positive: LGALS1, S100A4, S100A6, IL32, ANXA2
## Negative: TCF7, CCR7, TXK, PLAC8, FP236383.3
The PCA results can be visualized in different ways.
VizDimLoadings(pbmc.unstim, dims = 1:2, reduction = "pca")
DimPlot(pbmc.unstim, reduction = "pca")
DimHeatmap(pbmc.unstim, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc.unstim, dims = 1:15, cells = 500, balanced = TRUE)
Cells will be clustered based on PCA. How many PC to use is dependent on many factors. For example, if trying to analyze a rare cell subset, you might want to add more PCs. Usually, the first 10 is good to see dimensionality of the data.
ElbowPlot(pbmc.unstim)
##We select the top 15 PCs for clustering and tSNE based on PCElbowPlot
pbmc.unstim <- FindNeighbors(pbmc.unstim, reduction = "pca", dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
pbmc.unstim <- FindClusters(pbmc.unstim, resolution = 0.5, verbose = FALSE)
head(Idents(pbmc.unstim), 5)
## AAACCTGAGAAACGCC-1 AAACCTGAGAAGGGTA-1 AAACCTGAGACCTAGG-1 AAACCTGAGATAGTCA-1
## 10 2 6 0
## AAACCTGAGCAAATCA-1
## 1
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11
pbmc.unstim <- RunUMAP(pbmc.unstim, dims = 1:15)
## 10:15:59 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:15:59 Read 20924 rows and found 15 numeric columns
## 10:15:59 Using Annoy for neighbor search, n_neighbors = 30
## 10:15:59 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:16:01 Writing NN index file to temp file /tmp/RtmpA4NXVY/filebc682171bd0c
## 10:16:01 Searching Annoy index using 1 thread, search_k = 3000
## 10:16:08 Annoy recall = 100%
## 10:16:09 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:16:10 Initializing from normalized Laplacian + noise (using RSpectra)
## 10:16:10 Commencing optimization for 200 epochs, with 892462 positive edges
## 10:16:36 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc.unstim, reduction = "umap", label = TRUE)
pbmc.unstim <- RunTSNE(pbmc.unstim, reduction = "pca", dims = 1:15)
DimPlot(pbmc.unstim, reduction = "tsne", label = TRUE)
Lets check our metadata now of the seurat object to see what has been added.
head(pbmc.unstim@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1 pbmc_sc 16199 3729 1.524785
## AAACCTGAGAAGGGTA-1 pbmc_sc 12545 4071 4.161020
## AAACCTGAGACCTAGG-1 pbmc_sc 3125 1519 3.392000
## AAACCTGAGATAGTCA-1 pbmc_sc 10924 3523 4.137679
## AAACCTGAGCAAATCA-1 pbmc_sc 10657 3697 6.615370
## AAACCTGAGCAGGCTA-1 pbmc_sc 32387 6522 4.421527
## RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGAAACGCC-1 10 10
## AAACCTGAGAAGGGTA-1 2 2
## AAACCTGAGACCTAGG-1 6 6
## AAACCTGAGATAGTCA-1 0 0
## AAACCTGAGCAAATCA-1 1 1
## AAACCTGAGCAGGCTA-1 3 3
#We now have seurat clusters and RNA_snn_res.0.5 columns added.
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
markers_unstim <- FindAllMarkers(pbmc.unstim, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
VlnPlot will display the differential expression across the clusters. For example, I am looking here at CD8A and CD4 expression in the clusters.
VlnPlot(pbmc.unstim, features = c("CD8A", "CD4"))
The raw counts can also be shown instead by adding some parameters.
VlnPlot(pbmc.unstim, features = c("CD8A", "CD4"), slot = "counts", log = TRUE)
FeaturePlot(pbmc.unstim, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
library(tidyr)
u_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/Unstim.best", header = T, stringsAsFactors = F, check.names = F)
head(u_demuxlet)
## BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP
## 1 AAACCTGAGAAACGCC-1 42286 22364 20626 1854
## 2 AAACCTGAGAAGGGTA-1 35144 4231 3579 1888
## 3 AAACCTGAGACCTAGG-1 8284 1119 928 608
## 4 AAACCTGAGATAGTCA-1 31640 3692 3025 1681
## 5 AAACCTGAGCAAATCA-1 34136 4239 3322 1982
## 6 AAACCTGAGCAGGCTA-1 93780 12646 10291 4636
## BEST SNG.1ST
## 1 SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
## 2 DBL-GSA8_0_NYUMD0315-01-GSA8_0_NYUMD0327-01-0.500 GSA8_0_NYUMD0315-01
## 3 SNG-GSA8_0_NYUMD0334-01 GSA8_0_NYUMD0334-01
## 4 SNG-GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0289-01
## 5 SNG-GSA8_0_NYUMD0334-01 GSA8_0_NYUMD0334-01
## 6 DBL-GSA8_0_NYUMD0327-01-GSA8_0_NYUMD0334-01-0.500 GSA8_0_NYUMD0327-01
## SNG.LLK1 SNG.2ND SNG.LLK2 SNG.LLK0 DBL.1ST
## 1 -632.7918 GSA8_0_NYUMD0289-01 -1794.0976 -1099.8520 GSA8_0_NYUMD0289-01
## 2 -1267.1772 GSA8_0_NYUMD0327-01 -1376.1579 -1201.1205 GSA8_0_NYUMD0315-01
## 3 -220.6593 GSA8_0_NYUMD0327-01 -499.5366 -330.9340 GSA8_0_NYUMD0327-01
## 4 -601.2027 GSA8_0_NYUMD0327-01 -1580.6534 -975.7141 GSA8_0_NYUMD0289-01
## 5 -695.9992 GSA8_0_NYUMD0315-01 -1791.4493 -1106.4335 GSA8_0_NYUMD0315-01
## 6 -2584.2428 GSA8_0_NYUMD0334-01 -3897.3665 -2888.3866 GSA8_0_NYUMD0327-01
## DBL.2ND ALPHA LLK12 LLK1 LLK2 LLK10
## 1 GSA8_0_NYUMD0315-01 0.5 -1024.5246 -1794.0976 -632.7918 -1840.7324
## 2 GSA8_0_NYUMD0327-01 0.5 -885.9914 -1267.1772 -1376.1579 -1211.0138
## 3 GSA8_0_NYUMD0334-01 0.5 -301.3090 -499.5366 -220.6593 -458.4461
## 4 GSA8_0_NYUMD0315-01 0.5 -900.3046 -601.2027 -1603.4487 -610.6523
## 5 GSA8_0_NYUMD0334-01 0.5 -992.4663 -1791.4493 -695.9992 -1561.3775
## 6 GSA8_0_NYUMD0334-01 0.5 -2274.3541 -2584.2428 -3897.3665 -2729.2507
## LLK20 LLK00 PRB.DBL PRB.SNG1
## 1 -1024.5246 -1155.0510 2.49e-171 1
## 2 -1279.7057 -1088.3092 1.00e+00 1
## 3 -313.0930 -338.7091 3.14e-36 1
## 4 -900.3046 -996.4076 4.23e-131 1
## 5 -1019.5818 -1123.0042 5.87e-130 1
## 6 -3366.3864 -2648.3299 1.00e+00 1
To edit: I will split the Best column into multiple columns.
u_demuxlet_edit = u_demuxlet %>%
mutate(BEST = gsub("-01","", BEST)) %>%
separate(BEST, into=c("DMX_classification.global","DMX_maxID","DMX_secondID"), sep="-") %>%
separate(DMX_maxID, into=c("DMX_garbage1","DMX_garbage2","DMX_maxID"), sep ="_") %>%
separate(DMX_secondID, into=c("DMX_garbage3","DMX_garbage4","DMX_secondID"), sep ="_") %>%
select(-contains("garbage"))
## Warning: Expected 3 pieces. Additional pieces discarded in 3082 rows [2, 6, 8,
## 11, 15, 23, 24, 29, 30, 31, 34, 42, 54, 58, 59, 60, 68, 75, 76, 83, ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 18152 rows [1,
## 3, 4, 5, 7, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, ...].
head(u_demuxlet_edit)
## BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP DMX_classification.global
## 1 AAACCTGAGAAACGCC-1 42286 22364 20626 1854 SNG
## 2 AAACCTGAGAAGGGTA-1 35144 4231 3579 1888 DBL
## 3 AAACCTGAGACCTAGG-1 8284 1119 928 608 SNG
## 4 AAACCTGAGATAGTCA-1 31640 3692 3025 1681 SNG
## 5 AAACCTGAGCAAATCA-1 34136 4239 3322 1982 SNG
## 6 AAACCTGAGCAGGCTA-1 93780 12646 10291 4636 DBL
## DMX_maxID DMX_secondID SNG.1ST SNG.LLK1 SNG.2ND
## 1 NYUMD0315 <NA> GSA8_0_NYUMD0315-01 -632.7918 GSA8_0_NYUMD0289-01
## 2 NYUMD0315 NYUMD0327 GSA8_0_NYUMD0315-01 -1267.1772 GSA8_0_NYUMD0327-01
## 3 NYUMD0334 <NA> GSA8_0_NYUMD0334-01 -220.6593 GSA8_0_NYUMD0327-01
## 4 NYUMD0289 <NA> GSA8_0_NYUMD0289-01 -601.2027 GSA8_0_NYUMD0327-01
## 5 NYUMD0334 <NA> GSA8_0_NYUMD0334-01 -695.9992 GSA8_0_NYUMD0315-01
## 6 NYUMD0327 NYUMD0334 GSA8_0_NYUMD0327-01 -2584.2428 GSA8_0_NYUMD0334-01
## SNG.LLK2 SNG.LLK0 DBL.1ST DBL.2ND ALPHA
## 1 -1794.0976 -1099.8520 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0315-01 0.5
## 2 -1376.1579 -1201.1205 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01 0.5
## 3 -499.5366 -330.9340 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01 0.5
## 4 -1580.6534 -975.7141 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0315-01 0.5
## 5 -1791.4493 -1106.4335 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01 0.5
## 6 -3897.3665 -2888.3866 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01 0.5
## LLK12 LLK1 LLK2 LLK10 LLK20 LLK00 PRB.DBL
## 1 -1024.5246 -1794.0976 -632.7918 -1840.7324 -1024.5246 -1155.0510 2.49e-171
## 2 -885.9914 -1267.1772 -1376.1579 -1211.0138 -1279.7057 -1088.3092 1.00e+00
## 3 -301.3090 -499.5366 -220.6593 -458.4461 -313.0930 -338.7091 3.14e-36
## 4 -900.3046 -601.2027 -1603.4487 -610.6523 -900.3046 -996.4076 4.23e-131
## 5 -992.4663 -1791.4493 -695.9992 -1561.3775 -1019.5818 -1123.0042 5.87e-130
## 6 -2274.3541 -2584.2428 -3897.3665 -2729.2507 -3366.3864 -2648.3299 1.00e+00
## PRB.SNG1
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
table(u_demuxlet_edit$DMX_classification.global) #num of singlets and doublets
##
## AMB DBL SNG
## 25 3057 18152
table(u_demuxlet_edit$DMX_maxID) #number of cells identified as each donor
##
## NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
## 4904 5661 5475 5194
table(u_demuxlet_edit[,c("DMX_classification.global","DMX_maxID")]) #number of singlets or doublets identified as each donor
## DMX_maxID
## DMX_classification.global NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
## AMB 4 5 5 11
## DBL 1301 1131 608 17
## SNG 3599 4525 4862 5166
u_demuxlet_edit.subset <- u_demuxlet_edit[u_demuxlet_edit$BARCODE %in% colnames(pbmc.unstim),]
pbmc.unstim@meta.data <- cbind(pbmc.unstim@meta.data,u_demuxlet_edit.subset$DMX_maxID,u_demuxlet_edit.subset$DMX_classification.global, u_demuxlet_edit.subset$BARCODE)
head(pbmc.unstim@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1 pbmc_sc 16199 3729 1.524785
## AAACCTGAGAAGGGTA-1 pbmc_sc 12545 4071 4.161020
## AAACCTGAGACCTAGG-1 pbmc_sc 3125 1519 3.392000
## AAACCTGAGATAGTCA-1 pbmc_sc 10924 3523 4.137679
## AAACCTGAGCAAATCA-1 pbmc_sc 10657 3697 6.615370
## AAACCTGAGCAGGCTA-1 pbmc_sc 32387 6522 4.421527
## RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGAAACGCC-1 10 10
## AAACCTGAGAAGGGTA-1 2 2
## AAACCTGAGACCTAGG-1 6 6
## AAACCTGAGATAGTCA-1 0 0
## AAACCTGAGCAAATCA-1 1 1
## AAACCTGAGCAGGCTA-1 3 3
## u_demuxlet_edit.subset$DMX_maxID
## AAACCTGAGAAACGCC-1 NYUMD0315
## AAACCTGAGAAGGGTA-1 NYUMD0315
## AAACCTGAGACCTAGG-1 NYUMD0334
## AAACCTGAGATAGTCA-1 NYUMD0289
## AAACCTGAGCAAATCA-1 NYUMD0334
## AAACCTGAGCAGGCTA-1 NYUMD0327
## u_demuxlet_edit.subset$DMX_classification.global
## AAACCTGAGAAACGCC-1 SNG
## AAACCTGAGAAGGGTA-1 DBL
## AAACCTGAGACCTAGG-1 SNG
## AAACCTGAGATAGTCA-1 SNG
## AAACCTGAGCAAATCA-1 SNG
## AAACCTGAGCAGGCTA-1 DBL
## u_demuxlet_edit.subset$BARCODE
## AAACCTGAGAAACGCC-1 AAACCTGAGAAACGCC-1
## AAACCTGAGAAGGGTA-1 AAACCTGAGAAGGGTA-1
## AAACCTGAGACCTAGG-1 AAACCTGAGACCTAGG-1
## AAACCTGAGATAGTCA-1 AAACCTGAGATAGTCA-1
## AAACCTGAGCAAATCA-1 AAACCTGAGCAAATCA-1
## AAACCTGAGCAGGCTA-1 AAACCTGAGCAGGCTA-1
The Seurat Object has already been preprocessed in my case, so this should be clean)
pbmc.unstim[["percent.mt"]] <- PercentageFeatureSet(pbmc.unstim, pattern = "^MT-")
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
##
## align_plots
# look at distribution of metrics by classification
plot_grid(VlnPlot(pbmc.unstim, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "u_demuxlet_edit.subset$DMX_maxID"))
VlnPlot(pbmc.unstim, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "u_demuxlet_edit.subset$DMX_classification.global")
Now, select for MT content under 10% and for nfeatureRNA > 200 to ensure getting alive cells. I previously did this, so should see no change in cell numnbers.
pbmc.unstim <- subset(pbmc.unstim, subset = nFeature_RNA > 200 & percent.mt < 10)
length(colnames(pbmc.unstim))
## [1] 20924
length(rownames(pbmc.unstim))
## [1] 24998
Add column to meta data to identify seurat object as Basline condition. Also rename some columns for clarity purposes.
pbmc.unstim@meta.data$condition <- 'Baseline'
names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.subset$DMX_classification.global"] <- "DMX_classification.global"
names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.subset$DMX_maxID"] <- "DMX_maxID"
names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.subset$BARCODE"] <- "Barcode"
head(pbmc.unstim@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1 pbmc_sc 16199 3729 1.524785
## AAACCTGAGAAGGGTA-1 pbmc_sc 12545 4071 4.161020
## AAACCTGAGACCTAGG-1 pbmc_sc 3125 1519 3.392000
## AAACCTGAGATAGTCA-1 pbmc_sc 10924 3523 4.137679
## AAACCTGAGCAAATCA-1 pbmc_sc 10657 3697 6.615370
## AAACCTGAGCAGGCTA-1 pbmc_sc 32387 6522 4.421527
## RNA_snn_res.0.5 seurat_clusters DMX_maxID
## AAACCTGAGAAACGCC-1 10 10 NYUMD0315
## AAACCTGAGAAGGGTA-1 2 2 NYUMD0315
## AAACCTGAGACCTAGG-1 6 6 NYUMD0334
## AAACCTGAGATAGTCA-1 0 0 NYUMD0289
## AAACCTGAGCAAATCA-1 1 1 NYUMD0334
## AAACCTGAGCAGGCTA-1 3 3 NYUMD0327
## DMX_classification.global Barcode condition
## AAACCTGAGAAACGCC-1 SNG AAACCTGAGAAACGCC-1 Baseline
## AAACCTGAGAAGGGTA-1 DBL AAACCTGAGAAGGGTA-1 Baseline
## AAACCTGAGACCTAGG-1 SNG AAACCTGAGACCTAGG-1 Baseline
## AAACCTGAGATAGTCA-1 SNG AAACCTGAGATAGTCA-1 Baseline
## AAACCTGAGCAAATCA-1 SNG AAACCTGAGCAAATCA-1 Baseline
## AAACCTGAGCAGGCTA-1 DBL AAACCTGAGCAGGCTA-1 Baseline
saveRDS(pbmc.unstim, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.unstim.final.RDS")